package org.hscoder.websearcher;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.stream.Collectors;

/**
 * Minimal web page fetching demo.
 *
 * <p>Shows two ways to download the same page: the JDK's {@link URLConnection} and
 * Jsoup's URL-based parser. {@link #main(String[])} currently runs only the Jsoup variant.
 */
public class NetFetcher {

    /** Page fetched by both demo methods. */
    private static final String TARGET_URL = "http://www.baidu.com";

    /** Timeout, in milliseconds, applied to both fetching strategies. */
    private static final int TIMEOUT_MILLIS = 15000;

    /**
     * Fetches the page through a plain {@link URLConnection} and prints its content type,
     * content length and body to standard output.
     *
     * @throws IOException if the connection cannot be opened or the response cannot be read
     */
    private static void fetchByURL() throws IOException {
        URL url = new URL(TARGET_URL);

        URLConnection connection = url.openConnection();
        // Timeouts must be configured before the first call that implicitly connects
        // (getContentType() below); without them a stalled server blocks forever.
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);

        String contentType = connection.getContentType();
        // The long variant avoids truncating lengths above Integer.MAX_VALUE.
        long contentLength = connection.getContentLengthLong();

        String body;
        // try-with-resources closes the reader (and the underlying stream) even on failure.
        // NOTE(review): the body is decoded as UTF-8 instead of the platform default charset;
        // this assumes the target page is served as UTF-8 - confirm if the URL changes.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            body = reader.lines().collect(Collectors.joining(System.lineSeparator()));
        } catch (UncheckedIOException e) {
            // BufferedReader.lines() wraps read failures; unwrap to honour the declared contract.
            throw e.getCause();
        }

        System.out.println("contentType:" + contentType);
        System.out.println("contentLength:" + contentLength);
        System.out.println("body:\n" + body);
    }

    /**
     * Fetches and parses the page with Jsoup, then prints the document title followed by
     * the text and absolute target of every {@code <a>} element.
     *
     * @throws IOException if the page cannot be fetched
     */
    private static void fetchByJsoup() throws IOException {
        Document document = Jsoup.parse(new URL(TARGET_URL), TIMEOUT_MILLIS);
        System.out.println("标题：" + document.title());

        // Iterate the element collection directly; an intermediate stream adds nothing here.
        document.getElementsByTag("a")
                .forEach(e -> System.out.println(e.text() + ":" + e.absUrl("href")));
    }

    /**
     * Entry point; runs the Jsoup-based fetch.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if fetching the page fails
     */
    public static void main(String[] args) throws IOException {
        fetchByJsoup();
    }
}
